Construct data frame
load("XSTSF_production.RData")
source('functions.R')
# add human perceptual sandhi categories
label_sandhi <- read.csv('raw_data/sandhi_label.csv',
na.strings = '')
f0_all_pre_label <- f0_all_pre %>%
select(!sandhi_tone) %>%
left_join(label_sandhi[, c('ind_no', 'sandhi_tone', 'sandhi_tone_var', 'diortri')],
by = c('diortri', 'ind_no')) %>%
mutate(sandhi_tone_var = case_when(is.na(sandhi_tone_var) == TRUE ~ sandhi_tone,
.default = sandhi_tone_var)) %>%
rename(normtime = time)
# get disyllabic citation data
f0_di <- f0_all_pre_label %>% filter(diortri == 'di')
f0_di_ct <- f0_di %>% filter(focus_condition == 'ct') %>%
# re-normalisation
group_by(speaker) %>%
mutate(f0ref = mean(f0, na.rm = T),
norm_f0 = scale(log(f0))) %>%
ungroup()
# get individual datasets
f0_di_ct_lcmh <- f0_di_ct %>% filter(grepl("^[LM]", syntax_iniTone)) %>%
mutate(sandhi_tone = case_when(sandhi_tone == 'HLLM' ~ 'HMML',
sandhi_tone == 'LLHL' ~ 'LLRF',
.default = sandhi_tone))
# check manual labels
unique(f0_di_ct_lcmh$sandhi_tone)
## [1] "HMML" "MHHL" "LLLM" "LLRF" "LMML" NA "MMMH"
## [8] "HHHH" "outlier"
f0_di_ct_lcmh_h <- f0_di_ct_lcmh %>% filter( grepl('^H', mono_tone_1))
f0_di_ct_lcmh_hp <- f0_di_ct_lcmh_h %>% filter(grepl('p$', syntax_iniTone))
f0_di_ct_lcmh_hs <- f0_di_ct_lcmh_h %>% filter(grepl('s$', syntax_iniTone))
f0_di_ct_lcmh_l <- f0_di_ct_lcmh %>% filter( grepl('^[LR]', mono_tone_1))
f0_di_ct_lcmh_lp <- f0_di_ct_lcmh_l %>% filter(grepl('p$', syntax_iniTone))
f0_di_ct_lcmh_ls <- f0_di_ct_lcmh_l %>% filter(grepl('s$', syntax_iniTone))
# yinping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hp, 'speaker'), tooltip = c('text', 'x'))
# yinshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hs, 'speaker'), tooltip = c('text', 'x'))
# yangping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_lp, 'speaker'), tooltip = c('text', 'x'))
# yangshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_ls, 'speaker'), tooltip = c('text', 'x'))
f0_di_ct_lcmh_h <- f0_di_ct_lcmh_h %>%
mutate(sandhi_tone = ifelse(sandhi_tone == 'HLLM', 'HMML', sandhi_tone),
propdur = as.integer(normtime)/20) %>%
unite('groupvar', ind_no, syllable_no, sep = '_', remove = FALSE) %>%
filter(is.na(sandhi_tone) == FALSE) %>%
filter(!ind_no %in% c('S2_1_ct', 'S2_11_ct', 'S2_27_ct', 'S3_5_ct', 'S3_19_ct', 'S5_27_ct'))
unique(f0_di_ct_lcmh_h$sandhi_tone) # check the labels
## [1] "HMML" "MHHL" "MMMH" "HHHH"
p_cluster(f0_di_ct_lcmh_h, sandhi_tone)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
p_cluster(f0_di_ct_lcmh_h, sandhi_tone, 'speaker')
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# examine individual cases
f0_di_ct_lcmh_h %>%
filter(sandhi_tone == 'HMML' & speaker == 'S2') %>%
select('token', 'groupvar') %>% distinct()
## # A tibble: 2 × 2
## token groupvar
## <chr> <chr>
## 1 新路 S2_51_ct_1
## 2 新路 S2_51_ct_2
H-register LC & MH
f0_di_ct_lcmh_h_kmeans <- f0_di_ct_lcmh_h %>%
select(-diortri, -syllable_no, -focus_no, -f0, -groupvar, -propdur) %>%
spread(normtime, norm_f0)
f0_di_ct_lcmh_h_cluster <- k_means(f0_di_ct_lcmh_h_kmeans)
kml(f0_di_ct_lcmh_h_cluster, nbClusters = 2:10)
## ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
plot(f0_di_ct_lcmh_h_cluster, 4, parTraj=parTRAJ(col="clusters"))
f0_di_ct_lcmh_h_kmeans <- f0_di_ct_lcmh_h_kmeans %>%
mutate(cluster4 = getClusters(f0_di_ct_lcmh_h_cluster, 4))
cluster_solution <- get_cluster_solution(f0_di_ct_lcmh_h_kmeans)
compare_cluster(cluster_solution, 'cluster4')
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
# examine mismathes
f0_di_ct_lcmh_h_kmeans %>% filter(sandhi_tone == 'HMML' & cluster4 == 'C')
## # A tibble: 13 × 35
## speaker token focus_condition citation_no ind_no mono_tone_1 mono_tone_2
## <fct> <chr> <chr> <fct> <chr> <chr> <chr>
## 1 S1 青椒 ct 1 S1_1_ct HHp HHp
## 2 S1 书包 ct 3 S1_3_ct HHp HHp
## 3 S2 新路 ct 51 S2_51_ct HHp LHq
## 4 S5 新车 ct 45 S5_45_ct HHp HHp
## 5 S7 樱花 ct 9 S7_9_ct HHp HHp
## 6 S7 新车 ct 45 S7_45_ct HLp HHp
## 7 S7 新路 ct 51 S7_51_ct HLp LHq
## 8 S8 书包 ct 3 S8_3_ct HHp HHp
## 9 S8 樱花 ct 9 S8_9_ct HLp HHp
## 10 S8 青菜 ct 11 S8_11_ct HLp HLq
## 11 S8 樱桃 ct 17 S8_17_ct HLp RFp
## 12 S8 书房 ct 21 S8_21_ct HHp RFp
## 13 S8 青豆 ct 25 S8_25_ct HLp LHq
## # ℹ 28 more variables: mono_tone_3 <chr>, citation_tone <chr>, syntax <chr>,
## # syntax_iniTone <chr>, sandhi_tone <chr>, sandhi_tone_var <chr>,
## # f0ref <dbl>, `1` <dbl>, `2` <dbl>, `3` <dbl>, `4` <dbl>, `5` <dbl>,
## # `6` <dbl>, `7` <dbl>, `8` <dbl>, `9` <dbl>, `10` <dbl>, `11` <dbl>,
## # `12` <dbl>, `13` <dbl>, `14` <dbl>, `15` <dbl>, `16` <dbl>, `17` <dbl>,
## # `18` <dbl>, `19` <dbl>, `20` <dbl>, cluster4 <fct>
try doing k-means for the whole disyllabic citation dataset
f0_di_ct_lcmh_kmeans <- f0_di_ct_lcmh %>%
filter(!sandhi_tone %in% c(NA, 'outlier')) %>%
select(-diortri, -syllable_no, -focus_no, -f0) %>%
spread(normtime, norm_f0)
f0_di_ct_lcmh_cluster <- k_means(f0_di_ct_lcmh_kmeans)
kml(f0_di_ct_lcmh_cluster, nbClusters = 2:10)
## ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
plot(f0_di_ct_lcmh_cluster, 7, parTraj=parTRAJ(col="clusters"))
f0_di_ct_lcmh_kmeans <- f0_di_ct_lcmh_kmeans %>%
mutate(cluster7 = getClusters(f0_di_ct_lcmh_cluster, 7))
Mapping between human and machine clustering
cluster_solution <- get_cluster_solution(f0_di_ct_lcmh_kmeans)
compare_cluster(cluster_solution, 'cluster7')
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.